{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 0. Import Data, Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "meta_df = pd.read_csv('metadata-rh-np.csv', index_col = 'HATHI')\n",
    "dtm_df = pd.read_csv('data-rh-np.csv', index_col = 'HATHI')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add column with Boolean values we'll use for the regression\n",
    "\n",
    "meta_df['CLASS'] = meta_df['PUBLISHER'] != \"RANDOM HOUSE\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Additional Processing Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metadata Processing\n",
    "\n",
    "def author_max(meta_, thresh = 2):\n",
    "    # Downsample novels by author\n",
    "    # thresh = maximum novels per author\n",
    "    \n",
    "    np_auth_array = meta_[meta_['CLASS']]['AUTHOR'].unique()\n",
    "    rh_auth_array = meta_[~meta_['CLASS']]['AUTHOR'].unique()\n",
    "    \n",
    "    np_sub_index = [meta_[ (meta_['CLASS']) & (meta_['AUTHOR']==auth) ].index.tolist() for auth in np_auth_array]\n",
    "    np_sub_index = [index_ for index_list in np_sub_index\n",
    "                    for index_ in np.random.choice( index_list, min( thresh, len(index_list) ), replace = False ) ]\n",
    "    \n",
    "    rh_sub_index = [meta_[ (~meta_['CLASS']) & (meta_['AUTHOR']==auth) ].index.tolist() for auth in rh_auth_array]\n",
    "    rh_sub_index = [index_ for index_list in rh_sub_index\n",
    "                    for index_ in np.random.choice( index_list, min( thresh, len(index_list) ), replace = False ) ]\n",
    "\n",
    "    return meta_.loc[np_sub_index + rh_sub_index]\n",
    "\n",
    "def bootstrap_set(meta_):\n",
    "    # Re-sample novels to generate bootstrap confidence intervals\n",
    "    \n",
    "    # Downsample authors to prevent prolific writers from dominating the model\n",
    "    # Ensure that number of books from each class is equal\n",
    "    \n",
    "    # Return list of books that are in-sample and out-of-sample\n",
    "    \n",
    "    meta_ = author_max(meta_)\n",
    "    class_size = meta_[meta_['CLASS']].shape[0]\n",
    "    \n",
    "    sample_np_index = meta_[meta_['CLASS']].sample(class_size, replace = True).index.tolist()\n",
    "    sample_np_auths = meta_.loc[sample_np_index]['AUTHOR'].unique()\n",
    "    \n",
    "    sample_rh_index = meta_[~meta_['CLASS']].sample(class_size, replace = True).index.tolist()\n",
    "    sample_rh_auths = meta_.loc[sample_rh_index]['AUTHOR'].unique()\n",
    "    \n",
    "    sample_all_auths = list(sample_np_auths) + list(sample_rh_auths)\n",
    "    \n",
    "    oos_np_index = meta_[( meta_['CLASS']) & (~meta_['AUTHOR'].isin(sample_all_auths))].index.tolist()\n",
    "    oos_rh_index = meta_[(~meta_['CLASS']) & (~meta_['AUTHOR'].isin(sample_all_auths))].index.tolist()\n",
    "    \n",
    "    oos_rh_index = list(np.random.choice(oos_rh_index, size = len(oos_np_index), replace = False))\n",
    "    \n",
    "    return sample_np_index + sample_rh_index, oos_np_index + oos_rh_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Processing\n",
    "\n",
    "def top_n_features(dtm_, top_n = 5000):\n",
    "    # Takes a document-term matrix;\n",
    "    # Returns list of top_n most frequent terms, descending order\n",
    "    \n",
    "    return dtm_.sum().sort_values()[::-1][:top_n].index.tolist()\n",
    "\n",
    "def normalize(train_dtm_, test_dtm_):\n",
    "    # Takes a document-term matrix with raw frequencies;\n",
    "    # Returns document-term matrix normalized\n",
    "    # row-wise by l1-norm, then column-wise in standard units\n",
    "    \n",
    "    # Columns of test_dtm_ are normalized according to the\n",
    "    # mean and standard deviation of columns in train_dtm_\n",
    "    \n",
    "    train_dtm_ = train_dtm_.div(train_dtm_.sum(axis=1), axis = 'rows')\n",
    "    test_dtm_ = test_dtm_.div(test_dtm_.sum(axis=1), axis = 'rows')\n",
    "    \n",
    "    train_mean, train_std = train_dtm_.mean(), train_dtm_.std()\n",
    "    \n",
    "    train_dtm_ = ( train_dtm_ - train_mean ) / train_std\n",
    "    test_dtm_ = ( test_dtm_ - train_mean ) / train_std\n",
    "    \n",
    "    return train_dtm_, test_dtm_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Analysis\n",
    "\n",
    "def get_percentile(list_of_floats, pct):\n",
    "    \n",
    "    size = len(list_of_floats)\n",
    "    if size > 0:\n",
    "        list_of_floats = sorted(list_of_floats)\n",
    "        return list_of_floats[int(pct*size)]\n",
    "    else:\n",
    "        return np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Model Training & Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optimal Parameters: Vocabulary Size, Regularization Constant\n",
    "# Learned from five-fold cross validation\n",
    "\n",
    "opt_vocab = 5000\n",
    "opt_reg = 0.001\n",
    "\n",
    "\n",
    "# Number of iterations to bootstrap model\n",
    "\n",
    "boot_iters = 2000\n",
    "\n",
    "\n",
    "# Store values from model: text predictions, model accuracy, model coefficients\n",
    "\n",
    "prediction_by_index = {index_:[] for index_ in meta_df.index.tolist()}\n",
    "f1s = []\n",
    "coefs, coef_count = {}, {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The Model\n",
    "\n",
    "for k in range(boot_iters):\n",
    "    \n",
    "    # Split dtm_df into separate dataframes with\n",
    "    # in-sample (training), out-of-sample (test) values\n",
    "    sample_index, oos_index = bootstrap_set(meta_df)\n",
    "    sample_targets, oos_targets = meta_df.loc[sample_index]['CLASS'].astype(int), meta_df.loc[oos_index]['CLASS'].astype(int)\n",
    "    sample_dtm, oos_dtm = dtm_df.loc[sample_index], dtm_df.loc[oos_index]\n",
    "\n",
    "    \n",
    "    # Reduce size of DTMs to optimal number of features;\n",
    "    # Normalize values in in-sample, out-of-sample DTMs\n",
    "    # (Note that the list of top_n features will be slightly different each iteration)\n",
    "    sample_vocab = top_n_features(sample_dtm, top_n = opt_vocab)\n",
    "    sample_dtm, oos_dtm = sample_dtm[sample_vocab], oos_dtm[sample_vocab]\n",
    "    sample_dtm, oos_dtm = normalize(sample_dtm, oos_dtm)\n",
    "\n",
    "    \n",
    "    # Fit logistic regression on in-sample data\n",
    "    # Predict classes on out-of-sample data\n",
    "    lr = LogisticRegression(C = opt_reg)\n",
    "    lr.fit(sample_dtm, sample_targets)\n",
    "    oos_predictions = lr.predict_proba(oos_dtm)[:,1]\n",
    "    \n",
    "    \n",
    "    # Report overall accuracy as f1-score\n",
    "    f1s.append( f1_score(oos_targets,oos_predictions > 0.5))\n",
    "    \n",
    "    \n",
    "    # Save predictions for each out-of-sample novel\n",
    "    for m,index_ in enumerate(oos_index):\n",
    "        prediction_by_index[index_].append( oos_predictions[m] )\n",
    "    \n",
    "    \n",
    "    # Save regression coefficients for each token\n",
    "    # Use a counter to track frequency of token's inclusion in model\n",
    "    for n,token in enumerate(sample_vocab):\n",
    "        try:\n",
    "            coef_count[token] += 1\n",
    "            coefs[token][k] = lr.coef_[0][n]\n",
    "        except:\n",
    "            coef_count[token] = 1\n",
    "            coefs[token] = [0] * boot_iters\n",
    "            coefs[token][k] = lr.coef_[0][n]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accuracy Report\n",
    "\n",
    "# F1-Score, with lower/upper bounds on 95% confidence interval\n",
    "# Statistical Significance: if lower bound > 0.5, then p < 0.05\n",
    "\n",
    "np.mean(f1s), get_percentile(f1s, 0.025), get_percentile(f1s, 0.975)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export Predictions\n",
    "\n",
    "# Reports mean prediction for each text over all boostrapped iterations\n",
    "# (equivalent to the prediction on the text under leave-one-out cross validation)\n",
    "\n",
    "# Reports lower, upper bounds on 95% interval for each prediction\n",
    "\n",
    "meta_df['Predicted Probability'] = [np.mean(prediction_by_index[index_]) for index_ in meta_df.index.tolist()]\n",
    "meta_df['Lower Bound (95%-Interval)'] = [get_percentile(prediction_by_index[index_], 0.025) for index_ in meta_df.index.tolist()]\n",
    "meta_df['Upper Bound (95%-Interval)'] = [get_percentile(prediction_by_index[index_], 0.975) for index_ in meta_df.index.tolist()]\n",
    "\n",
    "meta_df.to_csv('replicated-predictions.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export Model\n",
    "\n",
    "# Reports mean coefficient for each token over all bootstrapped iterations\n",
    "# (equivalent to the coefficients for a model fitted on the full corpus)\n",
    "\n",
    "# Reports lower, upper bounds on 95% interval for each coefficient\n",
    "\n",
    "coef_df = pd.DataFrame()\n",
    "token_list = [k for k,v in sorted(coef_count.items(), key = lambda item: item[1], reverse = True)[:opt_vocab]]\n",
    "\n",
    "coef_df['Token'] = token_list\n",
    "coef_df['Regression Coefficient'] = [ np.mean(coefs[key_]) for key_ in token_list ]\n",
    "coef_df['Lower Bound (95%-Interval)'] = [ get_percentile(coefs[key_], 0.025) for key_ in token_list ]\n",
    "coef_df['Upper Bound (95%-Interval)'] = [ get_percentile(coefs[key_], 0.975) for key_ in token_list ]\n",
    "\n",
    "coef_df.sort_values('Regression Coefficient', ascending = False, inplace = True)\n",
    "coef_df.to_csv('replicated-model.csv', index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
